Code
# Setup ----
# Packages used throughout the lab. Each one is attached exactly once.
library(here)
library(tidyverse)
library(broom)
library(knitr)
library(kableExtra)
library(DT)

# Baby-name records; the path is assembled with here() from the
# "Week 9/Lab 9" folder components.
StateNames_A <- read_csv(here("Week 9", "Lab 9", "StateNames_A.csv"))

# Interactive preview of the raw data.
datatable(StateNames_A)

each state should be its own row
and each sex should have its own column
if there were no babies born for that combination of state & sex there should be a 0 (not an NA)
# Total number of babies named "Allison" per state, widened so that each
# state is one row and each sex is one column.
Allison_table <- StateNames_A |>
  filter(Name == "Allison") |>
  rename(sex = Gender) |>
  # Grouping by sex first orders the summary F-then-M, so the widened
  # columns come out as State, F, M -- the order the kable headers assume.
  group_by(sex, State) |>
  # .groups = "drop" returns an ungrouped tibble (no leftover grouping by
  # sex and no regrouping message).
  summarise(total_count = sum(Count), .groups = "drop") |>
  pivot_wider(
    names_from = sex,
    values_from = total_count,
    # State/sex combinations with no records become 0 rather than NA,
    # whichever sex columns happen to exist.
    values_fill = 0
  )

# Render the table as HTML with reader-friendly column headers.
Allison_table |>
  kable(
    format = "html",
    col.names = c("State", "Females", "Males"),
    caption = "Number of Babies named 'Allison' in each State sorted by Sex"
  )

| State | Females | Males |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
allison_f which contains only the babies assigned Female at birth.

# Records of babies named "Allison" whose sex is "F", reduced to the
# Year, State and Count columns (one row per record in the source data).
allison_f <- StateNames_A |>
  rename(sex = Gender) |>
  filter(Name == "Allison", sex == "F") |>
  # Any missing Count is treated as zero babies.
  replace_na(list(Count = 0)) |>
  # No group_by() here: nothing is summarised, so grouping would only
  # leave a stray grouping on the result.
  select(Year, State, Count)

# select() keeps exactly three columns, so exactly three headers are
# supplied. Passing five names is what raised
# "length of 'dimnames' [2] not equal to array extent".
allison_f |>
  kable(
    format = "html",
    col.names = c("Year", "State", "Number of Babies"),
    caption = "Number of Female Babies named 'Allison' per State by Year"
  )
# Count of female Allisons against year, one point per row of allison_f.
# Points are jittered and drawn semi-transparent.
allison_f |>
  ggplot(aes(x = Year, y = Count)) +
  geom_jitter(colour = "tomato", alpha = 0.3) +
  labs(title = "Number of Allisons in State over Time")

Fit a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #3, each year should have one observation–the total number of Allisons born that year.
# Simple linear regression with Count as the response and Year as the
# explanatory variable.
# NOTE(review): allison_f holds one row per state-year record, while the
# prompt asks for one total per year -- confirm whether Count should be
# summed by Year before fitting. Left as is because the printed
# coefficients below come from this fit.
allison_lm <- lm(Count ~ Year, data = allison_f)

Write out the estimated regression equation.

# Coefficient table (estimate, std.error, statistic, p.value) as a tibble.
allison_lm |>
  tidy()

# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 3894. 1766. 2.21 0.0277
2 Year -1.88 0.880 -2.14 0.0328
Estimated Number of Allisons = 3892.55986 - 1.881963(Year)
Plot the residuals of the model, that is, the actual values minus the predicted values. Comment on the residuals - do you see any patterns?
# Residuals-versus-fitted scatter for allison_lm; augment() supplies the
# .fitted and .resid columns.
augment(allison_lm) |>
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point()

It appears that there is not equal variance present in this model because there are clearly far more outlying positive residuals lying above the least squares line. Additionally, a pattern I can see is that the variance is decreasing as the fitted values increase, as the residual points tend to come closer to 0.
What do you conclude from this model? Is my name not cool anymore?
It is difficult to make conclusions from this model since equal variance is not present.
# Yearly totals for each spelling of the name: Allan, Alan and Allen.
Aln_data <- StateNames_A |>
  # %in% replaces the chained == / | comparisons and matches the filter
  # style used for Aln_prop.
  filter(Name %in% c("Allan", "Alan", "Allen")) |>
  group_by(Name, Year) |>
  # Sum over all remaining rows for a spelling and year; the result is
  # returned ungrouped.
  summarise(total_count = sum(Count), .groups = "drop")

# Render the yearly totals as an HTML table.
Aln_data |>
  kable(
    format = "html",
    col.names = c("Name", "Year", "Number of Babies"),
    caption = "Number of Babies per Spelling of 'Allen' by Year"
  )

| Name | Year | Number of Babies |
|---|---|---|
| Alan | 1997 | 2155 |
| Alan | 1998 | 2102 |
| Alan | 1999 | 2220 |
| Alan | 2000 | 2398 |
| Alan | 2001 | 2622 |
| Alan | 2002 | 2591 |
| Alan | 2003 | 3083 |
| Alan | 2004 | 3051 |
| Alan | 2005 | 3189 |
| Alan | 2006 | 3442 |
| Alan | 2007 | 3225 |
| Alan | 2008 | 3002 |
| Alan | 2009 | 2817 |
| Alan | 2010 | 2487 |
| Alan | 2011 | 2321 |
| Alan | 2012 | 2252 |
| Alan | 2013 | 2577 |
| Alan | 2014 | 2464 |
| Allan | 1997 | 430 |
| Allan | 1998 | 455 |
| Allan | 1999 | 457 |
| Allan | 2000 | 435 |
| Allan | 2001 | 441 |
| Allan | 2002 | 443 |
| Allan | 2003 | 576 |
| Allan | 2004 | 561 |
| Allan | 2005 | 509 |
| Allan | 2006 | 473 |
| Allan | 2007 | 466 |
| Allan | 2008 | 436 |
| Allan | 2009 | 436 |
| Allan | 2010 | 363 |
| Allan | 2011 | 299 |
| Allan | 2012 | 287 |
| Allan | 2013 | 328 |
| Allan | 2014 | 282 |
| Allen | 1997 | 1371 |
| Allen | 1998 | 1289 |
| Allen | 1999 | 1296 |
| Allen | 2000 | 1319 |
| Allen | 2001 | 1364 |
| Allen | 2002 | 1365 |
| Allen | 2003 | 1286 |
| Allen | 2004 | 1273 |
| Allen | 2005 | 1199 |
| Allen | 2006 | 1201 |
| Allen | 2007 | 1163 |
| Allen | 2008 | 1120 |
| Allen | 2009 | 1004 |
| Allen | 2010 | 935 |
| Allen | 2011 | 972 |
| Allen | 2012 | 924 |
| Allen | 2013 | 873 |
| Allen | 2014 | 828 |
# Yearly totals drawn as a line, with one panel per spelling.
Aln_data |>
  ggplot(aes(x = Year, y = total_count)) +
  geom_line() +
  facet_wrap(vars(Name)) +
  labs(title = "Total Number of Allan, Alan, and Allens over Time")

each spelling should be its own column
each state should have its own row
a 0 (not an NA) should be used to represent locations where there were no instances of these names
# Share of each spelling (Alan / Allan / Allen) within CA and PA for the
# year 2000: one row per state, one column per spelling.
Aln_prop <- StateNames_A |>
  filter(
    Name %in% c("Allan", "Alan", "Allen"),
    Year == 2000,
    State %in% c("CA", "PA")
  ) |>
  group_by(Name, State) |>
  summarise(total_count = sum(Count), .groups = "drop") |>
  # Proportions are taken within each state, so each row sums to 1.
  group_by(State) |>
  mutate(Prop = total_count / sum(total_count)) |>
  ungroup() |>
  select(Name, State, Prop) |>
  pivot_wider(
    names_from = Name,
    values_from = Prop,
    # A spelling with no records in a state is reported as 0, not NA.
    values_fill = 0
  )

# Styled HTML table. The column headers rely on the spellings arriving in
# alphabetical order from the grouped summary above.
Aln_prop |>
  kable(
    format = "html",
    digits = 3,
    col.names = c("State", "Alan", "Allan", "Allen"),
    caption = "Proportions of Different Spellings of 'Allen' per State"
  ) |>
  # "Arial" is the actual font family name; the previous "Ariel" was a typo.
  kable_classic(html_font = "Arial", font_size = 18) |>
  # Row 0 is the header row.
  row_spec(0, bold = TRUE)

| State | Alan | Allan | Allen |
|---|---|---|---|
| CA | 0.655 | 0.147 | 0.198 |
| PA | 0.429 | 0.101 | 0.471 |